---
title: "EMOCON (CogSci2022)"
publication: Kwon, M., Wager, T., & Phillips, J. (2022). Representations of emotion concepts: Comparison across pairwise, appraisal feature-based, and word embedding-based similarity spaces. Proceedings of the Annual Meeting of the Cognitive Science Society, 44(44). https://escholarship.org/uc/item/8vj3d366
author: "Mijin Kwon, Tor D. Wager, & Jonathan Phillips"
updated: "12/07/2022"
output:
  html_document: default
  pdf_document: default
---

# Libraries and other set-ups (#fin_checked)

```{r setup, include=FALSE}

## Libraries (#check -- add install.packages() & change in summarized format; maybe example from Mark's script?)

#rm(list=ls())
knitr::opts_chunk$set(echo = TRUE)
library(plyr)
library(lme4)
library(tidyverse)
library(knitr)
library(ggplot2)
library(ggrepel)
library(word2vec)
library(Hmisc)
library(magrittr)
library(dplyr)
library(ggpubr)
library(reshape2)
library(olsrr)
library(RColorBrewer)
library(lares)
library(factoextra)
library(FactoMineR)
library(gdata)
library(scales)
library(xtable)
library(pracma)
library(graph4lg)

# Other set-ups (e.g., functions, color panels)

## Directory 
workdir = getwd() # OR path to directory where you downloaded from dataverse

## Functions

### to change symmatric matrix to edge list (#check -- is this function used?)
gen.mat.to.edge.list<-function(mat,symmetric=TRUE,diagonal=TRUE,text=FALSE){
  #create edge list from matrix
  # if symmetric duplicates are removed
  mat<-as.matrix(mat)
  id<-is.na(mat) # used to allow missing
  mat[id]<-"nna"
  if(symmetric){mat[lower.tri(mat)]<-"na"} # use to allow missing values
  if(!diagonal){diag(mat)<-"na"}
  obj<-melt(mat)
  colnames(obj)<-c("source","target","value")
  obj<-obj[!obj$value=="na",]
  obj$value[obj$value=="nna"]<-NA
  if(!text){obj$value<-as.numeric(as.character(obj$value))}
  return(obj)
  # remove duplicates
}

# Emotion concept list
emotionConcept_fullList = c("amusement","anger","annoyance","anxiety","apprehension","awe","awkwardness","boredom","calmness",
                            "comfortableness","confusion","contempt","contentment","despair","desperation","devastation","disappointment",
                            "discomfortableness","disgust","elation","embarrassment","envy","exasperation","excitement","fear","frustration",
                            "fury","gratefulness","guilt","happiness","hope","hopelessness","horror","hurt","inspiration","jealousy","joy",
                            "longing","love","nostalgia","peace","pride","protectiveness","regret","relaxedness","relief","resentment",
                            "romanticness","sadness","satisfaction","sereneness","shame","surprise","sympathy","terror","triumph",
                            "unhappiness","worry")
```

# Data: pairwise similairy data (#fin_checked)

```{r dat_pairwise, echo=FALSE}

# Data: Pairwise similarity rating

## Load data: original feature ratings
d_pairwise_raw <- read_csv(file.path(workdir, "analysis/emoconData_pairwise.csv"))

# Calculating mean
d_pairwise_dist <- d_pairwise_raw %>%
  group_by(emotionPair) %>%
  dplyr::summarize(rating_mean = mean(rating, na.rm=TRUE))
```

# Data: 14 appraisal feature ratings & feature-based similarity data (#fin_checked)

```{r dat_feature, echo=FALSE}

# Data: Appraisal feature rating

## Load data: original feature ratings
d_appraisal_raw <- read_csv(file.path(workdir, "emoconData_appraisals.csv"))

## Calculating distance (euclidean distance)

### converting to edge list
d_appraisal_tmp1 <- as.matrix(dist(d_appraisal_raw)) # convert to matrix first
colnames(d_appraisal_tmp1) <- d_appraisal_raw$emotionConcept
rownames(d_appraisal_tmp1) <- d_appraisal_raw$emotionConcept

d_appraisal_tmp2 <- gen.mat.to.edge.list(d_appraisal_tmp1) # convert to edge list
colnames(d_appraisal_tmp2) <- c("emotion1", "emotion2", "distance")
d_appraisal_tmp2 <- as_tibble(d_appraisal_tmp2)

## adding emotion pair column
d_appraisal_tmp3 <- d_appraisal_tmp2[order(d_appraisal_tmp2$emotion2),]
d_appraisal_tmp3 <- d_appraisal_tmp2[order(d_appraisal_tmp2$emotion1),]
d_appraisal_tmp3 <- d_appraisal_tmp3 %>% 
  unite(emotionPair, c(emotion1, emotion2), sep = "_", remove = FALSE)

d_appraisal_dist <- dplyr::select(d_appraisal_tmp3, emotionPair, distance) # *** Study 3 similarity data in edge format (***dissimilarity)
# View(d_appraisal_dist) 

# Scaled data for difference score and PCA
d_appraisal_scaled <- d_appraisal_raw 
d_appraisal_scaled[ , -1] <- scale(d_appraisal_scaled[ , -1]) # scale the data
```

# Data: NLP model 1 (word2vec trained on Google news) & word embedding similarity data (cosine similarity) --> add script from EMOCON_survey4_word2vec (#fin_checked)

```{r dat_nlp2_w2v_gnews,echo=FALSE}

# Data info (#TODO: change the information below)
## Reference: https://code.google.com/archive/p/word2vec/
## Google news (GoogleNews-vectors-negative300.bin) 
## The following terms were replaced with an alternative (in parentheses) due to their absence in the models:
### discomfortableness (uncomfortableness)
### relaxedness (relaxed)
### romanticness (romantic)
### sereneness (serene)

# Data: Word embeddings (Google news)

## Load data: original feature ratings
d_wordEm_gnews_raw <- read_csv(file.path(workdir, "analysis/emoconData_wordEm_gNews.csv"))
d_wordEm_gnews_raw_re <- d_wordEm_gnews_raw
d_wordEm_gnews_raw_re <- d_wordEm_gnews_raw_re %>% 
  remove_rownames %>% tibble::column_to_rownames(var="emotionConcept")
d_wordEm_gnews_raw_re <- as.matrix(d_wordEm_gnews_raw_re)

## Calculating distance (cosine similarity)
d_wordEm_gnews_tmp1 <- word2vec_similarity(d_wordEm_gnews_raw_re, d_wordEm_gnews_raw_re, type = "cosine") # *** embedding-based similarity data (gnews) in matrix (***similarity)

### converting to edge list
d_wordEm_gnews_tmp2 <- gen.mat.to.edge.list(d_wordEm_gnews_tmp1)

colnames(d_wordEm_gnews_tmp2) <- c("emotion1", "emotion2", "wordVec")
d_wordEm_gnews_tmp2 <- as_tibble(d_wordEm_gnews_tmp2)

## making emotion pair column
d_wordEm_gnews_tmp3 <- d_wordEm_gnews_tmp2[order(d_wordEm_gnews_tmp2$emotion2),]
d_wordEm_gnews_tmp3 <- d_wordEm_gnews_tmp3[order(d_wordEm_gnews_tmp3$emotion1),]

d_wordEm_gnews_tmp3 <- d_wordEm_gnews_tmp3 %>% 
  unite(emotionPair, c(emotion1, emotion2), sep = "_", remove = FALSE)

d_wordEm_gnews_dist <- dplyr::select(d_wordEm_gnews_tmp3, emotionPair, wordVec) # *** embedding-based similarity data (gnews) in edgelist (***similarity))
View(d_wordEm_gnews_dist)
```

# Data: NLP model 2 (word2vec trained on Wikipedia) & word embedding similarity data (cosine similarity) (#fin_checked)

```{r dat_nlp2_w2v_wiki,echo=FALSE}

# Data info
## Reference: http://vectors.nlpl.eu/explore/embeddings/en/models/
## English Wikipedia (enwiki_upos_skipgram_300_3_2019) 
## The following terms were replaced with an alternative (in parentheses) due to their absence in the models:
### comfortableness (comfortable)
### discomfortableness (uncomfortable)
### gratefulness (grateful)
### relaxedness (relaxed)
### romanticness (romantic)
### sereneness (serene)
### protectiveness (protective)

# Data: Word embeddings (Wikipedia)

## Load data: original feature ratings
d_wordEm_wiki_raw <- read_csv("/Users/mijinkwon_macpro/Documents/projects/emotom/emotom_r/emotom/analysis/emoconData_wordEm_wiki.csv")
d_wordEm_wiki_raw_re <- d_wordEm_wiki_raw
d_wordEm_wiki_raw_re <- d_wordEm_wiki_raw_re %>% 
  remove_rownames %>% tibble::column_to_rownames(var="emotionConcept")
d_wordEm_wiki_raw_re <- as.matrix(d_wordEm_wiki_raw_re)

## Calculating distance (cosine similarity)
d_wordEm_wiki_tmp1 <- word2vec_similarity(d_wordEm_wiki_raw_re, d_wordEm_wiki_raw_re, type = "cosine") # *** embedding-based similarity data (gnews) in matrix (***similarity)

### converting to edge list
d_wordEm_wiki_tmp2 <- gen.mat.to.edge.list(d_wordEm_wiki_tmp1)

colnames(d_wordEm_wiki_tmp2) <- c("emotion1", "emotion2", "wordVec")
d_wordEm_wiki_tmp2 <- as_tibble(d_wordEm_wiki_tmp2)

## making emotion pair column
d_wordEm_wiki_tmp3 <- d_wordEm_wiki_tmp2[order(d_wordEm_wiki_tmp2$emotion2),]
d_wordEm_wiki_tmp3 <- d_wordEm_wiki_tmp3[order(d_wordEm_wiki_tmp3$emotion1),]

d_wordEm_wiki_tmp3 <- d_wordEm_wiki_tmp3 %>% 
  unite(emotionPair, c(emotion1, emotion2), sep = "_", remove = FALSE)

d_wordEm_wiki_dist <- dplyr::select(d_wordEm_wiki_tmp3, emotionPair, wordVec) # *** embedding-based similarity data (gnews) in edgelist (***similarity))
# View(d_wordEm_wiki_dist)
```

# Data: NLP model 3 (GPT3 & word embedding similarity data (cosine similarity) (#fin_checked)

```{r dat_nlp2_w2v_gpt3,echo=FALSE}

## Data info
### gpt3 Davinci
### reference: openai.io

# Data: Word embeddings (GPT3)

## Load data: original feature ratings
d_wordEm_gpt3_raw <- read_csv(file.path(workdir, "analysis/emoconData_wordEm_gpt3.csv"))
d_wordEm_gpt3_raw_re <- d_wordEm_gpt3_raw
d_wordEm_gpt3_raw_re <- d_wordEm_gpt3_raw_re %>% 
  remove_rownames %>% tibble::column_to_rownames(var="emotionConcept")
d_wordEm_gpt3_raw_re <- as.matrix(d_wordEm_gpt3_raw_re)

## Calculating distance (cosine similarity)
d_wordEm_gpt3_tmp1 <- word2vec_similarity(d_wordEm_gpt3_raw_re, d_wordEm_gpt3_raw_re, type = "cosine") # *** embedding-based similarity data (gnews) in matrix (***similarity)

### converting to edge list
d_wordEm_gpt3_tmp2 <- gen.mat.to.edge.list(d_wordEm_gpt3_tmp1)

colnames(d_wordEm_gpt3_tmp2) <- c("emotion1", "emotion2", "wordVec")
d_wordEm_gpt3_tmp2 <- as_tibble(d_wordEm_gpt3_tmp2)

## making emotion pair column
d_wordEm_gpt3_tmp3 <- d_wordEm_gpt3_tmp2[order(d_wordEm_gpt3_tmp2$emotion2),]
d_wordEm_gpt3_tmp3 <- d_wordEm_gpt3_tmp3[order(d_wordEm_gpt3_tmp3$emotion1),]

d_wordEm_gpt3_tmp3 <- d_wordEm_gpt3_tmp3 %>% 
  unite(emotionPair, c(emotion1, emotion2), sep = "_", remove = FALSE)

d_wordEm_gpt3_dist <- dplyr::select(d_wordEm_gpt3_tmp3, emotionPair, wordVec) # *** embedding-based similarity data (gnews) in edgelist (***similarity))
# View(d_wordEm_gpt3_dist)

```

# Data: combined dataset (#fin_checked)

``` {r rsa_data_combined,echo=FALSE}
## Combining data ("similarity" data, not "dissimilarity")

### pairwise similarity: raw rating
pair_sim <- d_pairwise_dist # similarity
pair_sim$rating_mean <- rescale(pair_sim$rating_mean) # rescale between 0 and 1

### feature-based similarity: euclidean distance
feature_sim <- d_appraisal_dist # dissimilarity
feature_sim$distance <- 1 - rescale(feature_sim$distance) # rescale between 0 and 1

### word embedding-based similarity - 1 (word2vec, gnews) : cosine similarity
word_sim_gnews <- d_wordEm_gnews_dist # similarity
word_sim_gnews$wordVec <- rescale(word_sim_gnews$wordVec) # rescale between 0 and 1

### word embedding-based similarity - 2 (word2vec, wiki) : cosine similarity
word_sim_wiki <- d_wordEm_wiki_dist # similarity
word_sim_wiki$wordVec <- rescale(word_sim_wiki$wordVec) # rescale between 0 and 1

### word embedding-based similarity - 1 (gpt3) : cosine similarity
word_sim_gpt3 <- d_wordEm_gpt3_dist # similarity
word_sim_gpt3$wordVec <- rescale(word_sim_gpt3$wordVec) # rescale between 0 and 1

emotionPair <- d_pairwise_dist$emotionPair

d_sim_combined_tmp1 = merge(pair_sim, feature_sim, by.x = "emotionPair", by.y = "emotionPair")
d_sim_combined_tmp2 = merge(d_sim_combined_tmp1, word_sim_gnews, by.x = "emotionPair", by.y = "emotionPair")
d_sim_combined_tmp3 = merge(d_sim_combined_tmp2, word_sim_wiki, by.x = "emotionPair", by.y = "emotionPair")
d_sim_combined_allPairs = merge(d_sim_combined_tmp3, word_sim_gpt3, by.x = "emotionPair", by.y = "emotionPair")
colnames(d_sim_combined_allPairs) = c("emotionPair", "pair_sim", "feature_sim", "word_sim_gnews", "word_sim_wiki", "word_sim_gpt3")
View(d_sim_combined_allPairs)

# removing the same concept pairs
same_concept_pairs <- c('amusement_amusement', 'anger_anger', 'annoyance_annoyance', 'anxiety_anxiety', 'apprehension_apprehension', 'awe_awe', 'awkwardness_awkwardness', 
  'boredom_boredom', 'calmness_calmness', 'comfortableness_comfortableness', 'confusion_confusion', 'contempt_contempt', 'contentment_contentment', 
  'despair_despair', 'desperation_desperation', 'devastation_devastation', 'disappointment_disappointment', 'discomfortableness_discomfortableness', 
  'disgust_disgust', 'elation_elation', 'embarrassment_embarrassment', 'envy_envy', 'exasperation_exasperation', 'excitement_excitement', 'fear_fear', 
  'frustration_frustration', 'fury_fury', 'gratefulness_gratefulness', 'guilt_guilt', 'happiness_happiness', 'hope_hope', 'hopelessness_hopelessness', 
  'horror_horror', 'hurt_hurt', 'inspiration_inspiration', 'jealousy_jealousy', 'joy_joy', 'longing_longing', 'love_love', 'nostalgia_nostalgia', 
  'peace_peace', 'pride_pride', 'protectiveness_protectiveness', 'regret_regret', 'relaxedness_relaxedness', 'relief_relief', 'resentment_resentment', 
  'romanticness_romanticness', 'sadness_sadness', 'satisfaction_satisfaction', 'sereneness_sereneness', 'shame_shame', 'surprise_surprise', 'sympathy_sympathy', 
  'terror_terror', 'triumph_triumph', 'unhappiness_unhappiness', 'worry_worry')

d_sim_combined_diffPairs <- subset(d_sim_combined_allPairs, !(emotionPair %in% same_concept_pairs))
d_sim_combined_samePairs <- subset(d_sim_combined_allPairs, (emotionPair %in% same_concept_pairs))

d_sim_combined <- d_sim_combined_diffPairs # combined dataset

# Data in vector and matrix formats for visualization and permutation test
d_sim_combined_tmp <- d_sim_combined%>% 
  separate(emotionPair, c('emotion1', 'emotion2'))

## Pairwise
vec_pair_sim <- d_sim_combined_tmp$pair_sim
mat_pair_sim <- df_to_pw_mat(d_sim_combined_tmp, from = "emotion1", to = "emotion2", value = "pair_sim")
mat_pair_sim_vis <- mat_pair_sim
diag(mat_pair_sim_vis) <- 1

## Appraisal features
vec_feature_sim <- d_sim_combined$feature_sim
mat_feature_sim <- df_to_pw_mat(d_sim_combined_tmp, from = "emotion1", to = "emotion2", value = "feature_sim")
mat_feature_sim_vis <- mat_feature_sim
diag(mat_feature_sim_vis) <- 1

## Word embedding - 1 (wiki)
vec_word_sim_wiki <- d_sim_combined$word_sim_wiki
mat_word_sim_wiki <- df_to_pw_mat(d_sim_combined_tmp, from = "emotion1", to = "emotion2", value = "word_sim_wiki")
mat_word_sim_wiki_vis <- mat_word_sim_wiki
diag(mat_word_sim_wiki_vis) <- 1

## Word embedding - 2 (gnews)
vec_word_sim_gnews <- d_sim_combined$word_sim_gnews
mat_word_sim_gnews <- df_to_pw_mat(d_sim_combined_tmp, from = "emotion1", to = "emotion2", value = "word_sim_gnews")
mat_word_sim_gnews_vis <- mat_word_sim_gnews
diag(mat_word_sim_gnews_vis) <- 1

## Word embedding - 3 (GPT3)
vec_word_sim_gpt3 <- d_sim_combined$word_sim_gpt3
mat_word_sim_gpt3 <- df_to_pw_mat(d_sim_combined_tmp, from = "emotion1", to = "emotion2", value = "word_sim_gpt3")
mat_word_sim_gpt3_vis <- mat_word_sim_gpt3
diag(mat_word_sim_gpt3_vis) <- 1
```

# Visualization: Similarity matrix heatmap

```{r}
# Visualization (heatmap)
superheat(mat_pair_sim_vis, 

          # place dendrograms on columns and rows 
          row.dendrogram = T, 
          col.dendrogram = T,
          
          # make gridlines white for enhanced prettiness
          grid.hline.col = "white",
          grid.vline.col = "white",
          
          # change the size of the label text
          left.label.text.size = 2,
          bottom.label.text.size = 2,
          
          # rotate bottom label text
          bottom.label.text.angle = 90,
          
          # legend.breaks = c(0.25, 0.5, 0.75, 1)
          )
```

# RSA correlation between similarity measures (#fin_checked)

```{r rsa_corr,echo=FALSE}

# Correlation (spearman)

## between pairwise and feature-based similarity
cor(d_sim_combined$pair_sim, d_sim_combined$feature_sim, method = "spearman")

## between word embedding-based similarity of different NLP models
cor(d_sim_combined$word_sim_gnews, d_sim_combined$word_sim_wiki, method = "spearman") # word2vec (Wikipedia) ~ word2vec (google news)
cor(d_sim_combined$word_sim_gpt3, d_sim_combined$word_sim_wiki, method = "spearman") # word2vec (Wikipedia) ~ GPT3
cor(d_sim_combined$word_sim_gpt3, d_sim_combined$word_sim_gnew, method = "spearman") # word2vec (Google news) ~ GPT3

## between pairwise and word embedding-based similarity
cor(d_sim_combined$word_sim_gnews, d_sim_combined$pair_sim, method = "spearman") # pairwise ~ word2vec (Wikipedia)
cor(d_sim_combined$word_sim_wiki, d_sim_combined$pair_sim, method = "spearman") # pairwise ~ word2vec (Google news)
cor(d_sim_combined$word_sim_gpt3, d_sim_combined$pair_sim, method = "spearman") # pairwise ~ GTP3
```

# Permutation test (#fin_checked for all chunks below)

We can see that the results of this test are wildly significant. However, if are stimuli are just s small sample from a large population, we may want a p-value that reflects the dependencies in the similarity matrices. To achieve this, we can instead use a permutation test. As with any permutation test, it is important that we permute at the level of independent observations. In this case, that means permuting the rows and columns of one of our similarity matrices with respect to the other.


```{r}
# permutation test: correlation between pairwise and feature-based similarity

# set random seed for reproducible results
set.seed(1)
nperm <- 5000 # set permutation count
nppl <- dim(mat_feature_sim)[1] # number of target people
permcor <- rep(NA,nperm) # preallocate results
for (i in 1:nperm){
  sel <- sample(nppl) # permutation vector
  rnsim <- squareform(mat_feature_sim[sel,sel]) # permute matrix and re-vectorize neural similarity
  permcor[i] <- cor(rnsim,vec_pair_sim) # calculate permuted correlation
}

# calculate p-value
mean(abs(permcor) > cor(vec_feature_sim, vec_pair_sim))

# visualize
hist(abs(permcor),xlim=c(0,.9),main="Permuted null versus actual correlation")
abline(v=cor(vec_feature_sim, vec_pair_sim),col="red",lwd=2)
```

```{r}
# permutation test: correlation between pairwise and word embedding-based (wiki) similarity 

# set random seed for reproducible results
set.seed(1)
nperm <- 5000 # set permutation count
nppl <- dim(mat_word_sim_wiki)[1] # number of target people
permcor <- rep(NA,nperm) # preallocate results
for (i in 1:nperm){
  sel <- sample(nppl) # permutation vector
  rnsim <- squareform(mat_word_sim_wiki[sel,sel]) # permute matrix and re-vectorize neural similarity
  permcor[i] <- cor(rnsim,vec_pair_sim) # calculate permuted correlation
}

# calculate p-value
mean(abs(permcor) > cor(vec_word_sim_wiki, vec_pair_sim))

# visualize
hist(abs(permcor),xlim=c(0,.4),main="Permuted null versus actual correlation")
abline(v=cor(vec_word_sim_wiki,vec_pair_sim),col="red",lwd=2)
```

```{r}
# permutation test: correlation between pairwise and word embedding-based (Google news) similarity 

# set random seed for reproducible results
set.seed(1)
nperm <- 5000 # set permutation count
nppl <- dim(mat_word_sim_gnews)[1] # number of target people
permcor <- rep(NA,nperm) # preallocate results
for (i in 1:nperm){
  sel <- sample(nppl) # permutation vector
  rnsim <- squareform(mat_word_sim_gnews[sel,sel]) # permute matrix and re-vectorize neural similarity
  permcor[i] <- cor(rnsim,vec_pair_sim) # calculate permuted correlation
}

# calculate p-value
mean(abs(permcor) > cor(vec_word_sim_gnews, vec_pair_sim))

# visualize
hist(abs(permcor),xlim=c(0,.4),main="Permuted null versus actual correlation")
abline(v=cor(vec_word_sim_gnews,vec_pair_sim),col="red",lwd=2)
```

```{r}
# permutation test: correlation between two word embedding-based (Google news and Wiki) similarity 

# set random seed for reproducible results
set.seed(1)
nperm <- 5000 # set permutation count
nppl <- dim(mat_word_sim_wiki)[1] # number of target people
permcor <- rep(NA,nperm) # preallocate results
for (i in 1:nperm){
  sel <- sample(nppl) # permutation vector
  rnsim <- squareform(mat_word_sim_wiki[sel,sel]) # permute matrix and re-vectorize neural similarity
  permcor[i] <- cor(rnsim,vec_word_sim_gnews) # calculate permuted correlation
}

# calculate p-value
mean(abs(permcor) > cor(vec_word_sim_wiki, vec_word_sim_gnews))

# visualize
hist(abs(permcor),xlim=c(0,.8),main="Permuted null versus actual correlation")
abline(v=cor(vec_word_sim_wiki,vec_word_sim_gnews),col="red",lwd=2)
```

# Visualization: RSA correlation between similarity measures (#fin_checked)

```{r rsa_corr_vis,echo=FALSE}
# Plots

plot(d_sim_combined$feature_sim, d_sim_combined$pair_sim, 
     xlab = "Feature-based similarity", ylab="Pairwise similarity", 
     xlim = c(0, 1), ylim = c(0,1), pch = 20 , cex=0.7)
abline(lm(d_sim_combined$pair_sim ~ d_sim_combined$feature_sim),col="red",lwd=2)

plot(d_sim_combined$word_sim_gnews, d_sim_combined$pair_sim, 
     xlab = "Word embedding-based similarity (Word2vec; Google news)", ylab="Pairwise similarity", 
     xlim = c(0, 1), ylim = c(0,1), pch = 20 , cex=0.7)
abline(lm(d_sim_combined$pair_sim ~ d_sim_combined$word_sim_gnews),col="red",lwd=2)

plot(d_sim_combined$word_sim_wiki, d_sim_combined$pair_sim, 
     xlab = "Word embedding-based similarity (Word2vec; Wikipedia)", ylab="Pairwise similarity", 
     xlim = c(0, 1), ylim = c(0,1), pch = 20 , cex=0.7)
abline(lm(d_sim_combined$pair_sim ~ d_sim_combined$word_sim_wiki),col="red",lwd=2)

plot(d_sim_combined$word_sim_gpt3, d_sim_combined$pair_sim, 
     xlab = "Word embedding-based similarity (GPT3)", ylab="Pairwise similarity", 
     xlim = c(0, 1), ylim = c(0,1), pch = 20 , cex=0.7)
abline(lm(d_sim_combined$pair_sim ~ d_sim_combined$word_sim_gpt3),col="red",lwd=2)

plot(d_sim_combined$word_sim_gnews, d_sim_combined$word_sim_wiki, 
     xlab = "Word embedding-based similarity (Word2vec; Google news)", ylab="Word embedding-based similarity (Word2vec; Wikipedia)", 
     xlim = c(0, 1), ylim = c(0,1), pch = 20 , cex=0.7)
abline(lm(d_sim_combined$word_sim_wiki ~ d_sim_combined$word_sim_gnews),col="red",lwd=2)
```

# Data for multiple regression (residuals ~ difference scores): residuals (#fin_checked)

```{r multiReg_data,echo=FALSE}

# different emotion pairs
emotionPair_diff <- d_sim_combined$emotionPair

# Data
pair_sim <- d_sim_combined$pair_sim
feature_sim <- d_sim_combined$feature_sim
word_sim_gnews <- d_sim_combined$word_sim_gnews
word_sim_wiki <- d_sim_combined$word_sim_wiki
word_sim_gpt3 <- d_sim_combined$word_sim_gpt3

# Get residual (1) from the regression model pairwise ~ wordembedding (Google news)
x_plot <- word_sim_gnews
y_plot <- pair_sim

df_gnews <- cbind(as.data.frame(emotionPair_diff), as.data.frame(x_plot), as.data.frame(y_plot))
rownames(df_gnews) <- emotionPair_diff

model_1 <- lm(y_plot ~ x_plot, data = df_gnews)   # regression
residuals_1 <- model_1$residuals
df_residuals_1 <- as.data.frame(residuals_1)
df_residuals_1 <- tibble::rownames_to_column(df_residuals_1, "emotionPair")

# Get residual(2) from the regression model pairwise ~ wordembedding (Wikipedia)
x_plot <- word_sim_wiki
y_plot <- pair_sim

df_wiki <- cbind(as.data.frame(emotionPair_diff), as.data.frame(x_plot), as.data.frame(y_plot))
rownames(df_wiki) <- emotionPair_diff

model_2 <- lm(y_plot ~ x_plot, data = df)   # regression
residuals_2 <- model_2$residuals
df_residuals_2 <- as.data.frame(residuals_2)
df_residuals_2 <- tibble::rownames_to_column(df_residuals_2, "emotionPair")

# Get residual(3) from the regression model pairwise ~ word embedding (GPT3)
x_plot <- word_sim_gpt3
y_plot <- pair_sim

df_gpt3 <- cbind(as.data.frame(emotionPair_diff), as.data.frame(x_plot), as.data.frame(y_plot))
rownames(df_gpt3) <- emotionPair_diff

model_3 <- lm(y_plot ~ x_plot, data = df_gpt3)   # regression
residuals_3 <- model_3$residuals
df_residuals_3 <- as.data.frame(residuals_3)
df_residuals_3 <- tibble::rownames_to_column(df_residuals_3, "emotionPair")

# Get residual(4) from the regression model pairwise ~ feature
x_plot <- feature_sim
y_plot <- pair_sim

df_feature <- cbind(as.data.frame(emotionPair_diff), as.data.frame(x_plot), as.data.frame(y_plot))
rownames(df_feature) <- emotionPair_diff

model_4 <- lm(y_plot ~ x_plot, data = df_feature)   # regression
residuals_4 <- model_4$residuals
df_residuals_4 <- as.data.frame(residuals_4)
df_residuals_4 <- tibble::rownames_to_column(df_residuals_4, "emotionPair")

# Combine emotion feature diff and residuals per pair
d_featureDiff_residual_tmp1 = merge(d_featureDiff, df_residuals_1, by.x = "emotionPair", by.y = "emotionPair")
d_featureDiff_residual_tmp2 = merge(d_featureDiff_residual_tmp1, df_residuals_2, by.x = "emotionPair", by.y = "emotionPair")
d_featureDiff_residual_tmp3 = merge(d_featureDiff_residual_tmp2, df_residuals_3, by.x = "emotionPair", by.y = "emotionPair")
d_featureDiff_residual = merge(d_featureDiff_residual_tmp3, df_residuals_4, by.x = "emotionPair", by.y = "emotionPair")

d_featureDiff_residual <- d_featureDiff_residual %>%
  relocate(residuals_1, .after = emotion2) %>%
  relocate(residuals_2, .after = residuals_1) %>%
  relocate(residuals_3, .after = residuals_2) %>%
  relocate(residuals_4, .after = residuals_3)
  
# View(d_featureDiff_residual)
```

# Data for multiple regression (residuals ~ difference scores): apprasial feature difference score (#fin_checked)

```{r dat_feature_diff,echo=FALSE}
## Calculating difference scores

# Set-up

emotion_features = colnames(d_appraisal_scaled[2:15])
emotion_concepts = d_appraisal_scaled$emotionConcept
emotion_pairs = combn(emotion_concepts,2)
n_concepts = length(emotion_concepts)
n_pairs = n_concepts * (n_concepts-1) / 2
n_features = length(emotion_features)

d_featureDiff <- data.frame(matrix(ncol = 16, nrow = n_pairs))   # create data frame
colnames(d_featureDiff) <- c("emotion1", "emotion2", emotion_features)   # provide column names

# Add difference scores
for (i in 1:n_features) {   
  emotion_features_current = emotion_features[i]
  
  for (j in 1:n_pairs) {
    emotion1 = emotion_pairs[1, j]
    emotion2 = emotion_pairs[2, j]
    
    d_featureDiff$emotion1[j] <- emotion1 # concept 1
    d_featureDiff$emotion2[j] <- emotion2 # concept 2
    
    d_featureDiff[j, emotion_features_current] = abs(d_appraisal_scaled[d_appraisal_scaled$emotionConcept == emotion1, emotion_features_current] - d_appraisal_scaled[d_appraisal_scaled$emotionConcept == emotion2, emotion_features_current])
  }
}

# Add emotion pair name as a column
d_featureDiff <- d_featureDiff %>%
  rowwise() %>%      # for each row
  mutate(emotionPair = paste(sort(c(emotion1, emotion2)), collapse = "_")) %>%  # sort the teams alphabetically and then combine them separating with -
  ungroup() %>%         # forget the row grouping
  relocate(emotionPair, .after = emotion2)

colnames(d_featureDiff)[4:17] = paste0(colnames(d_featureDiff)[4:17], "_diff")
# View(d_featureDiff)
```

# PCA on appraisal feature difference score (using PCA()) --> will use this! (#fin_checked)

```{r dat_feature_diff_pca,echo=FALSE}

# Get the difference score
d_featureDiff_tmp <- d_featureDiff[, 3:17] %>% 
  remove_rownames %>% 
  column_to_rownames(var="emotionPair")

# Run PCA on the difference score
featureDiff_pca <- PCA(d_featureDiff_tmp, ncp = 5, graph = FALSE) # number of PC = 5
print(featureDiff_pca)

## print the PCA results in table
featureDiff_prcomp <- prcomp(d_featureDiff_tmp, center = TRUE,scale. = TRUE)
# summary(featureDiff_prcomp)
# str(featureDiff_prcomp)
xtable(featureDiff_prcomp)

# print pca results in table

# Choosing the number of components to use
fviz_eig(featureDiff_pca, addlabels = TRUE, ylim = c(0, 50))

# More functions

# get_eigenvalue(res.pca): Extract the eigenvalues/variances of principal components
# fviz_eig(res.pca): Visualize the eigenvalues
# get_pca_ind(res.pca), get_pca_var(res.pca): Extract the results for individuals and variables, respectively.
# fviz_pca_ind(res.pca), fviz_pca_var(res.pca): Visualize the results individuals and variables, respectively.
# fviz_pca_biplot(res.pca): Make a biplot of individuals and variables

feature_PC_loading <- get_pca_ind(featureDiff_pca)$coord
feature_var <- get_pca_var(featureDiff_pca)$contrib

# Make a dataset with PCs
featureDiff_pca5 <- as.data.frame(feature_PC_loading)
colnames(featureDiff_pca5) <- c("PC1", "PC2", "PC3", "PC4", "PC5")
featureDiff_pca5 <- tibble::rownames_to_column(featureDiff_pca5, "emotionConcept")
colnames(featureDiff_pca5)[1] = "emotionPair"

# data

d_featureDiff_pca_gnews = merge(featureDiff_pca5, df_residuals_1, by.x = "emotionPair", by.y = "emotionPair")
d_featureDiff_pca_wiki = merge(featureDiff_pca5, df_residuals_2, by.x = "emotionPair", by.y = "emotionPair")
d_featureDiff_pca_gpt3 = merge(featureDiff_pca5, df_residuals_3, by.x = "emotionPair", by.y = "emotionPair")
d_featureDiff_pca_feature = merge(featureDiff_pca5, df_residuals_4, by.x = "emotionPair", by.y = "emotionPair")
d_featureDiff_pca_combined_tmp1 = merge(d_featureDiff_pca_gnews, df_residuals_2, by.x = "emotionPair", by.y = "emotionPair")
d_featureDiff_pca_combined_tmp2 = merge(d_featureDiff_pca_combined_tmp1, df_residuals_3, by.x = "emotionPair", by.y = "emotionPair")
d_featureDiff_pca_combined = merge(d_featureDiff_pca_combined_tmp2, df_residuals_4, by.x = "emotionPair", by.y = "emotionPair")

# Multiple regression

## gnews (= what gnews data cannot explain)
reg_gnews_featurePCA<- lm(residuals_1 ~ PC1 + PC2 + PC3 + PC4 +PC5, data = d_featureDiff_pca_gnews)  
summary(reg_gnews_featurePCA)

## wiki (= what gnews data cannot explain)
reg_wiki_featurePCA<- lm(residuals_2 ~ PC1 + PC2 + PC3 + PC4 +PC5, data = d_featureDiff_pca_wiki) 
summary(reg_wiki_featurePCA)

## gpt3 (= what gnews data cannot explain)
reg_gpt3_featurePCA<- lm(residuals_3 ~ PC1 + PC2 + PC3 + PC4 +PC5, data = d_featureDiff_pca_gpt3) 
summary(reg_gpt3_featurePCA)

## appraisal feature - raw (= what appraisal features cannot explain)
reg_featureRAW_featurePCA<- lm(residuals_4 ~ PC1 + PC2 + PC3 + PC4 +PC5, data = d_featureDiff_pca_feature) 
summary(reg_featureRAW_featurePCA)
```

# Visualization: Multiple regression (residuals ~ difference scores) (#fin_checked)

```{r pca_multiReg_vis_data,echo=FALSE}

## Change data from wide to long

d_featureDiff_residual_plot <- d_featureDiff_pca_combined

colnames(d_featureDiff_residual_plot)[2:6] = c("Valence", "Physical and social danger", "Directionality", "Embodiment", "Attention")

d_featureDiff_residual_long = melt(d_featureDiff_residual_plot, 
                                   id.vars=c("emotionPair", "residuals_1", "residuals_2", "residuals_3", "residuals_4"), 
                                   variable.name="emotion_feature_pca",
                                   value.name="diff_score_pca") # Specify id.vars: the variables to keep but not split apart on
# View(d_featureDiff_residual_long)

## Add group labels

q25_gnews = unname(quantile(d_featureDiff_residual_long$residuals_1)[2])
q50_gnews = unname(quantile(d_featureDiff_residual_long$residuals_1)[3])
q75_gnews = unname(quantile(d_featureDiff_residual_long$residuals_1)[4])

q25_wiki = unname(quantile(d_featureDiff_residual_long$residuals_2)[2])
q50_wiki = unname(quantile(d_featureDiff_residual_long$residuals_2)[3])
q75_wiki = unname(quantile(d_featureDiff_residual_long$residuals_2)[4])

q25_gpt3 = unname(quantile(d_featureDiff_residual_long$residuals_3)[2])
q50_gpt3 = unname(quantile(d_featureDiff_residual_long$residuals_3)[3])
q75_gpt3 = unname(quantile(d_featureDiff_residual_long$residuals_3)[4])

q25_feature = unname(quantile(d_featureDiff_residual_long$residuals_4)[2])
q50_feature = unname(quantile(d_featureDiff_residual_long$residuals_4)[3])
q75_feature = unname(quantile(d_featureDiff_residual_long$residuals_4)[4])

d_featureDiff_residual_long <- d_featureDiff_residual_long %>% 
  mutate(group_gnews = case_when(residuals_1 <=  q25_gnews ~ "1_Underestimated", 
                                 residuals_1 > q75_gnews  ~ "3_Overestimated",
                                 (residuals_1 > q25_gnews & residuals_1 < q75_gnews) ~ "2_Well-predicted")) %>%
  mutate(group_wiki = case_when(residuals_2 <=  q25_wiki ~ "1_Underestimated", 
                                 residuals_2 > q75_wiki  ~ "3_Overestimated",
                                 (residuals_2 > q25_wiki & residuals_2 <= q75_wiki) ~ "2_Well-predicted")) %>%
  mutate(group_gpt3 = case_when(residuals_3 <=  q25_gpt3 ~ "1_Underestimated", 
                                 residuals_3 > q75_gpt3  ~ "3_Overestimated",
                                 (residuals_3 > q25_gpt3 & residuals_3 <= q75_gpt3) ~ "2_Well-predicted")) %>%
  mutate(group_feature = case_when(residuals_4 <=  q25_gpt3 ~ "1_Underestimated", 
                                 residuals_4 > q75_gpt3  ~ "3_Overestimated",
                                 (residuals_4 > q25_gpt3 & residuals_4 <= q75_gpt3) ~ "2_Well-predicted")) %>%
  mutate(group_gnews_4q = case_when(residuals_1 <=  q25_gnews ~ "Q1", 
                                 residuals_1 > q75_gnews  ~ "Q4",
                                 (residuals_1 > q25_gnews & residuals_1 <= q50_gnews) ~ "Q2",
                                 (residuals_1 > q50_gnews & residuals_1 <= q75_gnews) ~ "Q3")) %>%
  mutate(group_wiki_4q = case_when(residuals_2 <=  q25_wiki ~ "Q1", 
                                   residuals_2 > q75_wiki  ~ "Q4",
                                   (residuals_2 > q25_wiki & residuals_2 <= q50_wiki) ~ "Q2",
                                   (residuals_2 > q50_wiki & residuals_2 <= q75_wiki) ~ "Q3")) %>%
  mutate(group_gpt3_4q = case_when(residuals_3 <=  q25_gpt3 ~ "Q1", 
                                   residuals_3 > q75_gpt3  ~ "Q4",
                                   (residuals_3 > q25_gpt3 & residuals_3 <= q50_gpt3) ~ "Q2",
                                   (residuals_3 > q50_gpt3 & residuals_3 <= q75_gpt3) ~ "Q3")) %>%
  mutate(group_feature_4q = case_when(residuals_4 <=  q25_gpt3 ~ "Q1", 
                                   residuals_4 > q75_gpt3  ~ "Q4",
                                   (residuals_4 > q25_gpt3 & residuals_4 <= q50_gpt3) ~ "Q2",
                                   (residuals_4 > q50_gpt3 & residuals_4 <= q75_gpt3) ~ "Q3"))


d_featureDiff_residual_long$group_gnews <- factor(d_featureDiff_residual_long$group_gnews)
d_featureDiff_residual_long$group_wiki <- factor(d_featureDiff_residual_long$group_wiki)
d_featureDiff_residual_long$group_gpt3 <- factor(d_featureDiff_residual_long$group_gpt3)
d_featureDiff_residual_long$group_feature <- factor(d_featureDiff_residual_long$group_feature)

## Make data with mean values  

d_featureDiff_residual_plot_gnews <- d_featureDiff_residual_long %>%
  group_by(group_gnews, emotion_feature_pca) %>%
  summarise_at(vars(diff_score_pca), list(diff_score_mean = mean, diff_score_max = max, diff_score_min = min, diff_score_sd = sd))
View(d_featureDiff_residual_plot_gnews)

d_featureDiff_residual_plot_wiki <- d_featureDiff_residual_long %>%
  group_by(group_wiki, emotion_feature_pca) %>%
  summarise_at(vars(diff_score_pca), list(diff_score_mean = mean, diff_score_max = max, diff_score_min = min, diff_score_sd = sd))
View(d_featureDiff_residual_plot_wiki)

d_featureDiff_residual_plot_gpt3 <- d_featureDiff_residual_long %>%
  group_by(group_gpt3, emotion_feature_pca) %>%
  summarise_at(vars(diff_score_pca), list(diff_score_mean = mean, diff_score_max = max, diff_score_min = min, diff_score_sd = sd))
View(d_featureDiff_residual_plot_gpt3)

d_featureDiff_residual_plot_feature <- d_featureDiff_residual_long %>%
  group_by(group_feature, emotion_feature_pca) %>%
  summarise_at(vars(diff_score_pca), list(diff_score_mean = mean, diff_score_max = max, diff_score_min = min, diff_score_sd = sd))
View(d_featureDiff_residual_plot_feature)

d_featureDiff_residual_plot_gnews_4q <- d_featureDiff_residual_long %>%
  group_by(group_gnews_4q, emotion_feature_pca) %>%
  summarise_at(vars(diff_score_pca), list(diff_score_mean = mean, diff_score_max = max, diff_score_min = min, diff_score_sd = sd))
View(d_featureDiff_residual_plot_gnews_4q)

d_featureDiff_residual_plot_wiki_4q <- d_featureDiff_residual_long %>%
  group_by(group_wiki_4q, emotion_feature_pca) %>%
  summarise_at(vars(diff_score_pca), list(diff_score_mean = mean, diff_score_max = max, diff_score_min = min, diff_score_sd = sd))
View(d_featureDiff_residual_plot_wiki_4q)

d_featureDiff_residual_plot_gpt3_4q <- d_featureDiff_residual_long %>%
  group_by(group_gpt3_4q, emotion_feature_pca) %>%
  summarise_at(vars(diff_score_pca), list(diff_score_mean = mean, diff_score_max = max, diff_score_min = min, diff_score_sd = sd))
View(d_featureDiff_residual_plot_gpt3_4q)

d_featureDiff_residual_plot_feature_4q <- d_featureDiff_residual_long %>%
  group_by(group_feature_4q, emotion_feature_pca) %>%
  summarise_at(vars(diff_score_pca), list(diff_score_mean = mean, diff_score_max = max, diff_score_min = min, diff_score_sd = sd))
View(d_featureDiff_residual_plot_feature_4q)
```

```{r pca_multiReg_vis,echo=FALSE}

# Color panel
nb.cols <- 14
mycolors <- colorRampPalette(brewer.pal(8, "Set1"))(nb.cols)

# Line plots

## Google news - 4 groups

(plot_gnews_4q <- ggplot(d_featureDiff_residual_plot_gnews_4q, aes(x=group_gnews_4q, y=diff_score_mean, group=emotion_feature_pca)) +
  geom_line(aes(color = emotion_feature_pca), size = 1.2) +
  geom_point(aes(color = emotion_feature_pca), size = 2) +
  ylim(-1.5, 1.5) +
  theme_bw() +
  scale_fill_manual(values = mycolors) +
  theme(legend.position="bottom") +
  labs(y = "Difference scores of appraisal feature components\n", x = "\nResidual quartile (Word2vec; Google news)", color = "Appraisal feature components"))

# Wiki - 4 groups

(plot_wiki_4q <- ggplot(d_featureDiff_residual_plot_wiki_4q, aes(x=group_wiki_4q, y=diff_score_mean, group=emotion_feature_pca)) +
  geom_line(aes(color = emotion_feature_pca), size = 1.2) +
  geom_point(aes(color = emotion_feature_pca), size = 2) +
  ylim(-1.5, 1.5) +
  theme_bw() +
  scale_fill_manual(values = mycolors) +
  theme(legend.position="bottom") +
  labs(y = "Difference scores of appraisal feature components\n", x = "\nResidual quartiles (Word2vec; Wikipedia)", color = "Appraisal feature components"))

# GPT3 - 4 groups

(plot_gpt3_4q <- ggplot(d_featureDiff_residual_plot_gpt3_4q, aes(x=group_gpt3_4q, y=diff_score_mean, group=emotion_feature_pca)) +
  geom_line(aes(color = emotion_feature_pca), size = 1.2) +
  geom_point(aes(color = emotion_feature_pca), size = 2) +
  ylim(-1.5, 1.5) +
  theme_bw() +
  scale_fill_manual(values = mycolors) +
  theme(legend.position="bottom") +
  labs(y = "Difference scores of appraisal feature components\n", x = "\nResidual quartiles (GPT3)", color = "Appraisal feature components"))

# Appraisal feature - 4 groups

(plot_feature_4q <- ggplot(d_featureDiff_residual_plot_feature_4q, aes(x=group_feature_4q, y=diff_score_mean, group=emotion_feature_pca)) +
  geom_line(aes(color = emotion_feature_pca), size = 1.2) +
  geom_point(aes(color = emotion_feature_pca), size = 2) +
  ylim(-1.5, 1.5) +
  theme_bw() +
  scale_fill_manual(values = mycolors) +
  theme(legend.position="bottom") +
  labs(y = "Difference scores of appraisal feature components\n", x = "\nResidual quartiles (Appraisal features)", color = "Appraisal feature components")) +
  theme(axis.text = element_text(size = 15), 
        axis.title = element_text(size = 18),
        legend.text = element_text(size = 15))
```
